Introduction
Reflection
Resources
# Project working directory.
# NOTE(review): hard-coded absolute path — adjust per machine; prefer an
# RStudio project / relative paths over setwd() in shared scripts.
setwd("D://Documents/ead/Udacity/dsNanodegree/dataAnalysisR/project/")

# Install the required packages, skipping any that are already installed.
# Fixed: 'kable' is a function inside knitr, not a CRAN package, and
# 'ggbiplot2' does not exist on CRAN (the PCA plots below use ggfortify),
# so both bogus entries were dropped; knitr was also listed twice.
pkgs <- c("markdown", "knitr", "ggplot2", "dplyr", "GGally", "tidyverse",
          "ggthemes", "corrplot", "e1071", "rpart", "randomForest",
          "ggfortify", "ROCR", "gridExtra", "htmlTable")
missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(missing) > 0) {
  install.packages(missing, dependencies = TRUE)
}
# Attach every library used in this analysis.
# Fixed: knitr was previously attached twice.
library(ROCR)
library(ggfortify)
library(ggplot2)
library(knitr)
library(dplyr)
library(GGally)
library(tidyverse)
library(ggthemes)
library(gridExtra)
library(corrplot)
library(grid)
library(lattice)
library(e1071)
library(rpart)
library(randomForest)
library(markdown)
library(htmlTable)
# Load the white wine dataset and drop the redundant row-index column "X"
# that read.csv picks up from the file.
wdf <- read.csv('data/wineQualityWhites.csv')
wdf[["X"]] <- NULL
We will study a white wine dataset with 4898 observations and 12 features. One of then is the quality of the wine from 0 to 10, and the others are chemical information. You can find a summary regarding the features in the table below.
# Dataset dimensions: 4898 observations x 12 features.
dim(wdf)
## [1] 4898 12
# Transpose the feature summary and persist it for the markdown table below.
# Fixed: the S3 method t.data.frame() was called directly on the result of
# summary(), which is a "table", not a data frame; use the generic t().
swdf <- t(summary(wdf))
write.csv(swdf, file = 'summary.csv')
| Feature | unit | Min | 1st Qu. | Median | Mean | 3rd Qu. | Max |
|---|---|---|---|---|---|---|---|
| fixed.acidity | [g(tartaric acid)/dm^3] | 3.80 | 6.30 | 6.80 | 6.86 | 7.30 | 14.20 |
| volatile.acidity | [g(acetic acid)/dm^3] | 0.08 | 0.21 | 0.26 | 0.28 | 0.32 | 1.10 |
| citric.acid | [g/dm^3] | 0.00 | 0.27 | 0.32 | 0.33 | 0.39 | 1.66 |
| residual.sugar | [g/dm^3] | 0.60 | 1.70 | 5.20 | 6.39 | 9.90 | 65.80 |
| chlorides | [g(sodium chloride)/dm^3] | 0.01 | 0.04 | 0.04 | 0.05 | 0.05 | 0.35 |
| free.sulfur.dioxide | [mg/dm^3] | 2.00 | 23.00 | 34.00 | 35.31 | 46.00 | 289.00 |
| total.sulfur.dioxide | [mg/dm^3] | 9.00 | 108.00 | 134.00 | 138.40 | 167.00 | 440.00 |
| density | [g/cm^3] | 0.99 | 0.99 | 0.99 | 0.99 | 1.00 | 1.04 |
| pH | (dimensionless) | 2.72 | 3.09 | 3.18 | 3.19 | 3.28 | 3.82 |
| sulphates | [g(potassium sulphate)/dm^3] | 0.22 | 0.41 | 0.47 | 0.49 | 0.55 | 1.08 |
| alcohol | [% vol.] | 8.00 | 9.50 | 10.40 | 10.51 | 11.40 | 14.20 |
| quality | score (0-10) | 3.00 | 5.00 | 6.00 | 5.88 | 6.00 | 9.00 |
# Bar plot of every feature, assembled in a single 4-column grid.
# A loop over the column names replaces the original twelve near-identical
# copy-pasted ggplot blocks; the plots produced are the same.
plots <- lapply(names(wdf), function(feat) {
  ggplot(data = wdf, aes_string(x = feat)) +
    geom_bar(fill = '#99CCFF')
})
u1 <- do.call(grid.arrange,
              c(plots,
                list(ncol = 4,
                     top = textGrob("Histograms for all features",
                                    gp = gpar(fontsize = 15, font = 3)))))
ggsave(file = 'pictures/1_univariate.png', u1)
Histogram
Histograms for all features in the dataset. We can observe that most of the graphs have a roughly normal distribution with a tendency toward positive skew. In the next section we will remove the outliers, based on our observations of the plots, and re-plot the histograms.
# Remove outliers from wdf (thresholds chosen from the univariate plots above).
# NOTE(review): this resolves a leftover git merge conflict that was committed
# verbatim. The side that filters `wdf` in place is kept, because all
# downstream exploratory code uses the filtered `wdf` (e.g. the class counts
# 1565/1055/2139 below); the wdf.outliers copy is rebuilt independently in the
# modelling section. The eight chained subset() calls are collapsed into one
# vectorized filter with identical row-selection semantics.
wdf <- subset(wdf,
              fixed.acidity < 11 &
              volatile.acidity < 0.75 &
              citric.acid < 1 &
              residual.sugar < 30 &
              chlorides < 0.10 &
              free.sulfur.dioxide < 125 &
              total.sulfur.dioxide < 350 &
              density < 1.005)
Histogram without outliers
Histograms re-plotted without the outliers.
# Density plot per feature, annotated with summary-statistic vertical lines:
# mean (blue), median (red), 1st and 3rd quartiles (dashed).
# Fixed: the intercepts are pre-computed scalar constants, so they are passed
# directly to geom_vline() instead of being wrapped in aes_string() — aes() is
# for data-column mappings, not fixed values. The hand-rolled counter is
# replaced with seq_along() and the plot list is preallocated.
feature.list <- names(wdf)
p <- vector("list", length(feature.list))
for (i in seq_along(feature.list)) {
  var <- feature.list[i]
  vals <- wdf[, var]
  p[[i]] <- ggplot(data = wdf, aes_string(x = var)) +
    geom_density(fill = '#99CCFF') +
    geom_vline(xintercept = mean(vals), color = 'blue', size = 0.5) +
    geom_vline(xintercept = median(vals), color = 'red', size = 0.5) +
    geom_vline(xintercept = quantile(vals, 0.25),
               linetype = 'dashed', size = 0.5) +
    geom_vline(xintercept = quantile(vals, 0.75),
               linetype = 'dashed', size = 0.5) +
    ylab(NULL)
}
ggsave(file = 'pictures/density_univariate_outliers.png', do.call(grid.arrange, p))
Density plots for each feature with stats lines
univariate density plots without outliers
In this plot, and from the histograms above, we can see the mean, median, 1st quartile and 3rd quartile drawn over each density distribution. All features now look approximately normally distributed, except residual.sugar, which still has a slight positive skew.
# Derived quality classification (also shown in the table below):
# | classification | Criterion   |
# |----------------|-------------|
# | Bad            | quality < 6 |
# | Normal         | quality = 6 |
# | Good           | quality > 6 |
qlab <- ifelse(wdf$quality < 6, 'bad',
               ifelse(wdf$quality == 6, 'normal', 'good'))
wdf$quality.2 <- as.factor(qlab)
table(wdf$quality.2)
##
## bad good normal
## 1565 1055 2139
# The helper column is only needed for the count table above; drop it so the
# later numeric-only steps (cor, ggpairs) keep working on wdf.
wdf$quality.2 <- NULL
| Quality.2 | raw dataset | outliers removed dataset |
|---|---|---|
| bad | 1640 | 1565 |
| good | 1060 | 1055 |
| normal | 2198 | 2139 |
# Pie chart of the quality classes, with the percentage share in each label.
# Fixed: `percent` was computed but never displayed, despite the comment
# promising a percent legend; it is now pasted into the slice labels.
# Also fixed the `lable` typo and the `=` top-level assignments.
rating <- c(1565, 2139, 1055)
percent <- round(100 * rating / sum(rating), 1)
colors <- c("red", "orange", "blue")
slice.labels <- paste0(c('Bad', 'Normal', 'Good'), ' (', percent, '%)')
pie(rating, labels = slice.labels,
    main = 'Wine quality Pie chart distribution', col = colors)
Pie chart
See Dataset Structure above.
The quality is the main feature of interest.
From Wine specialists the basics characteristics are:
Tannin is correlated with phenolic compounds; however, we don’t have this variable in our dataset, and Body is a mix of all the characteristics mentioned above. Therefore, I will focus on the following variables:
Yes, I created a new variable called quality.2 from quality as in the table below:
| classification | Criterion |
|---|---|
| Bad | quality < 6 |
| Normal | quality = 6 |
| Good | quality > 6 |
No. I didn’t change the original data.
# Scatter-plot / correlation matrix for every pair of features, with grid
# lines and axis text stripped so the 12x12 grid stays readable.
ggpairs(wdf, title = 'GGPAIRS') +
  theme(axis.line = element_blank(),
        axis.text = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
Ggpairs
# Pearson correlation matrix visualised with corrplot.
cor.wdf <- cor(wdf)
# Fixed: the plot result was previously assigned to `c`, shadowing base::c().
cor.plot <- corrplot.mixed(cor.wdf, tl.pos = 'lt', mar = c(2, 0, 2, 0),
                           title = 'CORRPLOT graphic')
Corrplot
Looking at the “ggpairs” and “corrplot” results it is easy to get a good idea of the correlations between the features. The table below lists a few pairs with positive, negative, and near-zero correlation. The positive-correlation table has 4 rows, but we will discard row 2 because those features have similar properties.
| Positive correlation | correlation | correlation outliers | |
|---|---|---|---|
| 1 | density x residual.sugar | 0.84 | 0.84 |
| 2 | total.sulfur.dioxide x free.sulfur.dioxide | 0.62 | 0.61 |
| 3 | density x total.sulfur.dioxide | 0.53 | 0.55 |
| 4 | quality x alcohol | 0.44 | 0.44 |
| Negative correlation | correlation | correlation outliers | |
|---|---|---|---|
| 1 | density x alcohol | -0.78 | -0.81 |
| 2 | total.sulfur.dioxide x alcohol | -0.45 | -0.46 |
| 3 | alcohol x residual.sugar | -0.45 | -0.48 |
| Zero correlation | correlation | correlation outliers | |
|---|---|---|---|
| 1 | quality x citric.acid | ~ 0 | ~ 0 |
| 2 | sulphates x chlorides | ~ 0 | 0.06 |
| 3 | density x volatile.acidity | ~ 0 | -0.03 |
# Bivariate boxplots for selected feature pairs, grouped into positive,
# negative and zero-correlation panels. A small helper builds each panel;
# y-axis limits clip extreme values on some panels purely for visualization.
make_box <- function(xvar, yvar, plot.title, ylim = NULL) {
  g <- ggplot(data = wdf, aes_string(x = xvar, y = yvar)) +
    geom_boxplot(aes(group = cut_width(quality, 1)), fill = '#99CCFF') +
    ggtitle(plot.title)
  if (!is.null(ylim)) {
    g <- g + scale_y_continuous(limits = ylim)
  }
  g
}
# Positive correlation plots
p1 <- make_box('density', 'residual.sugar', 'density X residual.sugar', c(0, 25))
p2 <- make_box('density', 'total.sulfur.dioxide', 'density X total.sulfur.dioxide')
p3 <- make_box('quality', 'alcohol', 'quality X alcohol')
# Negative correlation plots
p4 <- make_box('density', 'alcohol', 'density X alcohol')
p5 <- make_box('alcohol', 'total.sulfur.dioxide', 'total.sulfur.dioxide X alcohol')
p6 <- make_box('alcohol', 'residual.sugar', 'alcohol X residual.sugar', c(0, 30))
# Zero correlation plots
p7 <- make_box('quality', 'citric.acid', 'quality X citric.acid', c(0, 1))
p8 <- make_box('chlorides', 'sulphates', 'sulphates X chlorides')
p9 <- make_box('density', 'volatile.acidity', 'density X volatile.acidity', c(0, 0.9))
grid1 <- grid.arrange(p1, p2, p3, ncol = 3, top = textGrob("Bivariate Boxplots with Positive Correlation", gp = gpar(fontsize = 15, font = 3)))
grid2 <- grid.arrange(p4, p5, p6, ncol = 3, top = textGrob("Bivariate Boxplots with Negative Correlation", gp = gpar(fontsize = 15, font = 3)))
grid3 <- grid.arrange(p7, p8, p9, ncol = 3, top = textGrob("Bivariate Boxplots with Zero Correlation", gp = gpar(fontsize = 15, font = 3)))
ggsave(file = 'pictures/1_boxplot_bivariate_outliers.png', grid1)
ggsave(file = 'pictures/2_boxplot_bivariate_outliers.png', grid2)
ggsave(file = 'pictures/3_boxplot_bivariate_outliers.png', grid3)
Bivariate Boxplots
Bivariate Boxplots
Bivariate Boxplots
# Bivariate scatter plots for the same feature pairs, with a linear fit where
# a trend is expected. Axis limits clip outliers for better visualization.
# On the boxplot grouping warning, see:
# https://ggplot2.tidyverse.org/reference/geom_boxplot.html
make_scatter <- function(xvar, yvar, xlim = NULL, ylim = NULL, fit = TRUE) {
  g <- ggplot(data = wdf, aes_string(x = xvar, y = yvar)) +
    geom_point()
  if (fit) {
    g <- g + geom_smooth(method = 'lm', formula = y ~ x)
  }
  if (!is.null(xlim)) {
    g <- g + scale_x_continuous(limits = xlim)
  }
  if (!is.null(ylim)) {
    g <- g + scale_y_continuous(limits = ylim)
  }
  g
}
# Positive correlation plots
p1 <- make_scatter('density', 'residual.sugar', xlim = c(0.98, 1.01), ylim = c(0, 30))
p2 <- make_scatter('density', 'total.sulfur.dioxide', xlim = c(0.98, 1.01))
p3 <- make_scatter('quality', 'alcohol', fit = FALSE)
# Negative correlation plots
p4 <- make_scatter('density', 'alcohol', xlim = c(0.98, 1.01))
p5 <- make_scatter('alcohol', 'total.sulfur.dioxide')
p6 <- make_scatter('alcohol', 'residual.sugar', ylim = c(0, 30))
# Zero correlation plots
p7 <- make_scatter('quality', 'citric.acid', fit = FALSE)
p8 <- make_scatter('chlorides', 'sulphates', xlim = c(0, 0.2))
p9 <- make_scatter('density', 'volatile.acidity', xlim = c(0.98, 1.01), ylim = c(0, 0.9))
grid1 <- grid.arrange(p1, p2, p3, ncol = 3, top = textGrob("Bivariate Scatter plot with Positive Correlation", gp = gpar(fontsize = 15, font = 3)))
grid2 <- grid.arrange(p4, p5, p6, ncol = 3, top = textGrob("Bivariate Scatter plot with Negative Correlation", gp = gpar(fontsize = 15, font = 3)))
grid3 <- grid.arrange(p7, p8, p9, ncol = 3, top = textGrob("Bivariate Scatter plot with Zero Correlation", gp = gpar(fontsize = 15, font = 3)))
ggsave(file = 'pictures/1_scatterplot_bivariate_outliers.png', grid1)
ggsave(file = 'pictures/2_scatterplot_bivariate_outliers.png', grid2)
ggsave(file = 'pictures/3_scatterplot_bivariate_outliers.png', grid3)
Positive Correlation Bivariate Scatterplot
Negative Correlation Bivariate Scatterplot
Zero Correlation Bivariate Scatterplot
In the density_X_residual.sugar plot we can observe a strong correlation. There are more wines with low density and low residual.sugar than with high values. In density_X_total.sulfur.dioxide the point cloud has an elliptical shape: the wines concentrate in the middle for both features, but total.sulfur.dioxide is spread out much more.
In the density_X_alcohol plot we can see the expected behaviour, since the density of alcohol is lower than the density of water. Both the second and third plots show a negative tendency, and for alcohol_X_residual.sugar there is a concentration of wines with low residual.sugar quantities.
Those plots we can check what was expected from the correlation value. They don’t have any strong tendency.
No.
| Positive correlation | correlation | correlation outliers | |
|---|---|---|---|
| 1 | density x residual.sugar | 0.84 | 0.84 |
| 2 | total.sulfur.dioxide x free.sulfur.dioxide | 0.62 | 0.61 |
| 3 | density x total.sulfur.dioxide | 0.53 | 0.55 |
| 4 | quality x alcohol | 0.44 | 0.44 |
In this table we can compare the correlations computed on the raw dataset and on the dataset without outliers; there is no meaningful difference between them. We can disregard total.sulfur.dioxide x free.sulfur.dioxide because one is part of the other.
We can confirm the correlation between density and residual.sugar in the ggpairs plot and in the linear regression line over the previous one. In all 4 multivariate graphics we can observe a concentration of good wines rather than normal and bad ones.
# Re-load the untouched dataset for model training.
wdf.raw <- read.csv('data/wineQualityWhites.csv')
wdf.raw$X <- NULL
# Fixed: the factor was previously built from the FILTERED `wdf`
# (as.factor(wdf$quality)), whose row count no longer matches wdf.raw;
# the response must come from wdf.raw itself.
wdf.raw$quality <- as.factor(wdf.raw$quality)
# Train/test split on the raw data (40% train, as in the original).
set.seed(13)
samp <- sample(nrow(wdf.raw), 0.4 * nrow(wdf.raw))
train.raw <- wdf.raw[samp, ]
test.raw <- wdf.raw[-samp, ]
# Outlier-filtered copy, same thresholds as in the univariate section
# (eight chained subset() calls collapsed into one vectorized filter).
wdf.outliers <- subset(wdf.raw,
                       fixed.acidity < 11 &
                       volatile.acidity < 0.75 &
                       citric.acid < 1 &
                       residual.sugar < 30 &
                       chlorides < 0.10 &
                       free.sulfur.dioxide < 125 &
                       total.sulfur.dioxide < 350 &
                       density < 1.005)
# Train/test split on the outlier-filtered data.
set.seed(13)
samp <- sample(nrow(wdf.outliers), 0.4 * nrow(wdf.outliers))
train.outliers <- wdf.outliers[samp, ]
test.outliers <- wdf.outliers[-samp, ]
# Random forest on the raw dataset, repeated 10 times; each run's accuracy is
# appended to a CSV log. Fixed: removed an unused df.list, replaced the
# hand-rolled while-loop counter with a for loop, stopped shadowing
# utils::str with a variable named `str`, and simplified the redundant
# formula `quality ~ . - quality` to `quality ~ .` (the `.` already excludes
# the response).
for (run in seq_len(10)) {
  model <- randomForest(quality ~ ., data = train.raw)
  rf.pred <- predict(model, newdata = test.raw)
  acc.rf <- classAgreement(table(rf.pred, test.raw$quality))[1]
  log.row <- paste('rf', Sys.time(), as.numeric(acc.rf), sep = ',')
  write(log.row, file = 'data/accuracy_models.csv', append = TRUE)
}
# Random forest on the OUTLIER-FILTERED dataset (the original comment wrongly
# said "Raw"), repeated 10 times with accuracies logged to CSV.
# Same cleanups as the raw-data loop: no unused df.list, for instead of a
# while-counter, no `str` shadowing, and `quality ~ .` instead of the
# redundant `quality ~ . - quality`.
for (run in seq_len(10)) {
  model <- randomForest(quality ~ ., data = train.outliers)
  rf.pred <- predict(model, newdata = test.outliers)
  acc.rf <- classAgreement(table(rf.pred, test.outliers$quality))[1]
  log.row <- paste('rf.out', Sys.time(), as.numeric(acc.rf), sep = ',')
  write(log.row, file = 'data/accuracy_models.csv', append = TRUE)
}
# Grid-search the SVM hyperparameters (gamma, cost) on both datasets and
# print the cross-validation summaries.
# Tune on the raw dataset.
obj.raw <- tune.svm(quality ~ ., data = train.raw,
                    gamma = 2^(-1:1), cost = 2^(2:4))
summary(obj.raw)
# Tune on the outlier-filtered dataset.
obj.outliers <- tune.svm(quality ~ ., data = train.outliers,
                         gamma = 2^(-1:1), cost = 2^(2:4))
summary(obj.outliers)
SVM tune raw
SVM tune outliers
# SVM (cost = 4, gamma = 0.5, chosen from the tuning step) on the raw
# dataset, 10 repetitions logged to CSV. Fixed: removed the unused df.list,
# replaced the while-loop counter with a for loop, and stopped shadowing
# utils::str.
for (run in seq_len(10)) {
  svm.model <- svm(quality ~ ., data = train.raw, cost = 4, gamma = 0.5)
  # Column 12 is `quality`; drop it from the predictors at prediction time.
  svm.pred <- predict(svm.model, test.raw[, -12])
  acc.svm <- classAgreement(table(svm.pred, test.raw$quality))[1]
  log.row <- paste('svm', Sys.time(), as.numeric(acc.svm), sep = ',')
  write(log.row, file = 'data/accuracy_models.csv', append = TRUE)
}
# SVM (cost = 4, gamma = 0.5) on the outlier-filtered dataset,
# 10 repetitions logged to CSV. Same cleanups as the raw-data SVM loop.
for (run in seq_len(10)) {
  svm.model <- svm(quality ~ ., data = train.outliers, cost = 4, gamma = 0.5)
  # Column 12 is `quality`; drop it from the predictors at prediction time.
  svm.pred <- predict(svm.model, test.outliers[, -12])
  acc.svm <- classAgreement(table(svm.pred, test.outliers$quality))[1]
  log.row <- paste('svm.out', Sys.time(), as.numeric(acc.svm), sep = ',')
  write(log.row, file = 'data/accuracy_models.csv', append = TRUE)
}
# Decision tree (rpart) on the raw dataset, 10 repetitions logged to CSV.
# Fixed: removed the unused df.list and two dead statements (a table() and a
# classAgreement() whose results were discarded every iteration); replaced
# the while-loop counter with a for loop and the `str` shadowing.
for (run in seq_len(10)) {
  rpart.model <- rpart(quality ~ ., data = train.raw)
  rpart.pred <- predict(rpart.model, test.raw[, -12], type = 'class')
  acc.rpart <- classAgreement(table(rpart.pred, test.raw$quality))[1]
  log.row <- paste('rpart', Sys.time(), as.numeric(acc.rpart), sep = ',')
  write(log.row, file = 'data/accuracy_models.csv', append = TRUE)
}
# Decision tree (rpart) on the outlier-filtered dataset, 10 repetitions
# logged to CSV. Same cleanups as the raw-data rpart loop (dead statements
# and unused df.list removed, for loop, no `str` shadowing).
for (run in seq_len(10)) {
  rpart.model <- rpart(quality ~ ., data = train.outliers)
  rpart.pred <- predict(rpart.model, test.outliers[, -12], type = 'class')
  acc.rpart <- classAgreement(table(rpart.pred, test.outliers$quality))[1]
  log.row <- paste('rpart.out', Sys.time(), as.numeric(acc.rpart), sep = ',')
  write(log.row, file = 'data/accuracy_models.csv', append = TRUE)
}
# Load the accuracy log and summarise the mean accuracy (%) per model.
# NOTE(review): this assumes the CSV already contains a header row with
# columns `model` and `accuracy` — the write() calls above append bare
# "name,time,value" rows only; verify the file was created with a header.
acc.df <- read.csv(file = 'data/accuracy_models.csv')

# Mean accuracy for one model label, rounded to 4 significant digits and
# expressed as a percentage. Replaces six copies of the same three lines.
mean_acc_pct <- function(df, label) {
  m <- mean(df[df$model == label, 'accuracy'])
  as.numeric(format(m, digits = 4)) * 100
}

mean.acc.rf <- mean_acc_pct(acc.df, 'rf')
mean.acc.rf.out <- mean_acc_pct(acc.df, 'rf.out')
mean.acc.svm <- mean_acc_pct(acc.df, 'svm')
mean.acc.svm.out <- mean_acc_pct(acc.df, 'svm.out')
mean.acc.rpart <- mean_acc_pct(acc.df, 'rpart')
mean.acc.rpart.out <- mean_acc_pct(acc.df, 'rpart.out')

# Fixed: `acc.matrix` was never defined, which produced the rendered error
# "object 'acc.matrix' not found". Build it from the means above:
# one row per dataset, one column per model.
acc.matrix <- matrix(c(mean.acc.rf, mean.acc.svm, mean.acc.rpart,
                       mean.acc.rf.out, mean.acc.svm.out, mean.acc.rpart.out),
                     nrow = 2, byrow = TRUE)
htmlTable(acc.matrix,
          css.cell = ("padding-left: 1em; padding-right: 1em;"),
          header = c('Random Forest', 'SVM', 'Rpart'),
          rnames = c('RAW Data', 'Outliers Data'),
          caption = "Accuracy Mean (%)")
To better understand how the features are correlated I ran a PCA algorithm and I did the plot for the PCA Components and a graphic with Variance x Number of Component to decide how many Components I will use in the following models.
# PCA on the 11 standardised chemical features (quality excluded), plus a
# scree plot used to pick the number of components; the blue guides mark the
# chosen cut-off at 8 components.
wdf.pca <- prcomp(wdf.outliers[, 1:11], center = TRUE, scale. = TRUE)
plot(wdf.pca, type = "l")
abline(h = 0.55, v = 8, col = "blue")
# Biplot of the first two components with the feature loadings overlaid.
pca.plot <- autoplot(wdf.pca, loadings = TRUE, loadings.colour = 'blue',
                     loadings.label = TRUE, loadings.label.size = 5,
                     alpha = 0.3, main = 'PCA')
ggsave(file = 'pictures/pca_outliers.png', pca.plot)
PCA Variance
Looking at this graphic it is easy to identify that we can run the models with 8 components without losing accuracy in our model.
PCA components
In this graphic we can confirm some theories about the correlations between the variables. For example, we expect a negative correlation between alcohol and residual.sugar, since sugar is consumed to produce alcohol. pH should also be negatively correlated with citric.acid and fixed.acidity, since a lower pH means a more acidic solution.
# Build a modelling data frame from the first 8 principal components plus the
# quality label.
# NOTE(review): resolves a leftover git merge conflict. The HEAD side is
# kept because wdf.pca was fitted on wdf.outliers, so its scores must be
# paired with wdf.outliers$quality (the discarded side mixed in the filtered
# `wdf` and sampled on the wrong row count); the downstream random forest
# also uses the HEAD names new.train.pca / new.test.pca.
new.wdf.pca <- data.frame(wdf.pca$x)
new.wdf.pca <- data.frame(new.wdf.pca[, 1:8], quality = wdf.outliers$quality)
# 80/20 train/test split on the PCA scores.
set.seed(123)
samp <- sample(nrow(new.wdf.pca), 0.8 * nrow(new.wdf.pca))
new.train.pca <- new.wdf.pca[samp, ]
new.test.pca <- new.wdf.pca[-samp, ]
# Random forest on the 8-component PCA features, 10 repetitions logged to a
# separate CSV. Same cleanups as the earlier model loops: no unused df.list,
# for loop instead of a while-counter, no `str` shadowing, and `quality ~ .`
# instead of the redundant `quality ~ . - quality`.
for (run in seq_len(10)) {
  model <- randomForest(quality ~ ., data = new.train.pca, ntree = 150)
  rf.pred <- predict(model, newdata = new.test.pca)
  acc.rf <- classAgreement(table(rf.pred, new.test.pca$quality))[1]
  log.row <- paste('rf', Sys.time(), as.numeric(acc.rf), sep = ',')
  write(log.row, file = 'data/accuracy_models_pca.csv', append = TRUE)
}
# Load the PCA-model accuracy log and report the mean accuracy (%).
acc.df <- read.csv(file = 'data/accuracy_models_pca.csv')
mean.acc.rf <- mean(subset(acc.df, acc.df$model == 'rf')$accuracy)
mean.acc.rf <- as.numeric(format(mean.acc.rf, digits = 4)) * 100
# NOTE(review): resolves a leftover git merge conflict inside this call.
# The HEAD side is kept: only the single random-forest mean is computed
# here, so a one-column table with the 'Outliers Data' row is the only
# shape that matches the data.
htmlTable(mean.acc.rf,
          css.cell = ("padding-left: 1em; padding-right: 1em;"),
          header = c('Random Forest'),
          rnames = c('Outliers Data'),
          caption = "Accuracy Mean (%)")
We could increase the accuracy by more than 4% using Random Forest with PCA.

Final Plots and Summary

Reflection: The models I built didn’t perform as well as I expected. Applying PCA was a good choice for trying to eliminate redundant information in the dataset, and we could see some accuracy improvement in the results. Removing the outliers gave a better result — both with and without PCA — of around 3%.

General resources:
- Remove a column from a data frame: https://stackoverflow.com/questions/6286313/remove-an-entire-column-from-a-data-frame-in-r/30620946
- Markdown table generator: https://www.tablesgenerator.com/markdown_tables
- Colors in ggplot: http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/
- Computing and visualizing PCA in R: https://www.r-bloggers.com/computing-and-visualizing-pca-in-r/